Text classification using latent factors


In [2]:
from __future__ import print_function

# Print the installed scikit-learn version so the output records which
# release the notebook was executed with.
from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)


Sklearn version: 0.18.1

Data


In [4]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups.
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

# One set of loading options shared by both splits, so the test documents are
# stripped, filtered and shuffled exactly like the training documents.
newsgroups_kwargs = dict(remove=('headers', 'footers', 'quotes'),
                         categories=categories, shuffle=True, random_state=42)

twenty_train = fetch_20newsgroups(subset='train', **newsgroups_kwargs)

# The pipeline and evaluation cells further down read `docs_test` and
# `twenty_test`; define them here so the notebook runs top to bottom on a
# fresh kernel instead of relying on leftover kernel state.
# NOTE(review): assumes the test split should use the same `remove` setting
# as the training split -- confirm against the originally reported scores.
twenty_test = fetch_20newsgroups(subset='test', **newsgroups_kwargs)
docs_test = twenty_test.data

twenty_train.target_names


Out[4]:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
# Peek at one raw training document together with its integer class label.
first_doc, first_label = twenty_train.data[0], twenty_train.target[0]
print(first_doc)
print('Target: ', first_label)


Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
Target:  1

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts: English stop words are removed, terms appearing
# in more than 95% of the documents or in fewer than 2 documents are dropped,
# and the vocabulary is capped at 5000 terms.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=5000,
    max_df=0.95,
    min_df=2,
)
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)
X_train_counts.shape


Out[6]:
(2257, 5000)

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# Convert raw counts into term frequencies. IDF re-weighting is switched off
# (use_idf=False), despite the "tfidf" in the variable names.
tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


Out[8]:
(2257, 5000)

In [ ]:

Reduce dimensionality


In [21]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Number of latent topics to learn, and how many terms to list per topic
# when the topics are printed.
n_topics, n_top_words = 6, 20

# LDA is fitted on the raw term counts (not the normalized frequencies),
# using the online learning method with a fixed seed for repeatable topics.
lda = LatentDirichletAllocation(
    n_topics=n_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
lda.fit(X_train_counts)


Out[21]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=6, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [22]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted decomposition exposing ``components_``; each entry is iterated
        as one topic and must support ``argsort()``.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to show per topic, strongest first.

    Prints a ``Topic #k:`` header and one space-separated line of terms per
    topic, followed by a single blank line after the last topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest_first = topic.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_idx] for term_idx in strongest_first]
        print(" ".join(top_terms))
    print()

# Vocabulary terms from the count vectorizer, used to label each topic's
# strongest components with readable words.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(model=lda,
                feature_names=tf_feature_names,
                n_top_words=n_top_words)


Topic #0:
church pope catholic marriage authority married orthodox canon schism mass liturgy bishop ceremony st churches catholics does priest jurisdiction coptic
Topic #1:
image file jpeg use program files images gif color know format does thanks graphics software using version bit available like
Topic #2:
edu com graphics mail send pub keyboard ftp data computer information cs systems software ca faq available gov contact pc
Topic #3:
god people think don jesus just does believe know say like time bible way things good true life christian question
Topic #4:
health use medical years people disease food msg new patients like don doctor research time 1993 10 day know just
Topic #5:
banks gordon skepticism edu soon pitt geb intellect chastity n3jxp dsl shameful cadre surrender father spirit son holy int col


In [23]:
# Project every training document onto the fitted topics: one row per
# document, one column per topic.
X_trn_topics = lda.transform(X_train_counts)
X_trn_topics.shape


Out[23]:
(2257, 6)

In [ ]:

Classify


In [ ]:
from sklearn.neighbors import RadiusNeighborsClassifier
# Classify in topic space: the model is fitted on the document-topic matrix
# and the training labels, using a neighbourhood radius of 1.0.
neigh = RadiusNeighborsClassifier(radius=1.0)
neigh.fit(X_trn_topics, twenty_train.target)

In [ ]:


In [ ]:

Pipelines


In [30]:
# `Pipeline` and `np` are used below; import them in this cell so it does
# not depend on names left over in the kernel.
import numpy as np
from sklearn.pipeline import Pipeline

# Raw text -> term counts -> LDA topic proportions -> radius-neighbours vote.
# Bundling the steps guarantees the same transformations are applied when
# fitting and when predicting.
text_lda_knn = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_topics=200, max_iter=25,
                                 learning_method='online',
                                 learning_offset=200.,
                                 random_state=0)),
                         ('clf', RadiusNeighborsClassifier(radius=1.0)),
                        ])

# Assign to `_` to suppress the long fitted-pipeline repr.
_ = text_lda_knn.fit(twenty_train.data, twenty_train.target)

# Fraction of test documents whose predicted label matches the true label.
predicted = text_lda_knn.predict(docs_test)
np.mean(predicted == twenty_test.target)


Out[30]:
0.63049267643142481

In [ ]:
# `SGDClassifier`, `Pipeline` and `np` are used below; import them in this
# cell so it does not depend on names left over in the kernel.
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Same vectorizer and LDA front end as the radius-neighbours pipeline, but
# with a linear classifier (hinge loss, L2 penalty) on the topic features.
# `n_iter` is the parameter name in the scikit-learn release this notebook
# reports (0.18.1).
text_lda_sdg = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                         ('lda', LatentDirichletAllocation(n_topics=200, max_iter=25,
                                 learning_method='online',
                                 learning_offset=200.,
                                 random_state=0)),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, n_iter=25, random_state=42)),
                        ])

# Assign to `_` to suppress the long fitted-pipeline repr.
_ = text_lda_sdg.fit(twenty_train.data, twenty_train.target)

# Fraction of test documents whose predicted label matches the true label.
predicted = text_lda_sdg.predict(docs_test)
np.mean(predicted == twenty_test.target)

In [ ]:

Evaluate


In [12]:
# Score 2 new docs
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# The fitted pipeline accepts raw text and applies its own vectorizer and LDA
# steps, so no manual feature extraction is needed here (the previous code
# called an undefined `clf` on features no model in this notebook was
# trained on).
# The result goes into its own name so the test-set `predicted` array used by
# the report and confusion-matrix cells below is not overwritten.
predicted_new = text_lda_sdg.predict(docs_new)

for doc, category in zip(docs_new, predicted_new):
    print('%r => %s' % (doc, twenty_train.target_names[category]))


'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics

In [19]:
from sklearn import metrics
# Per-class precision, recall and F1 of `predicted` against the test-set
# labels, with rows labelled by newsgroup name.
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))


                        precision    recall  f1-score   support

           alt.atheism       0.74      0.61      0.67       319
         comp.graphics       0.81      0.92      0.86       389
               sci.med       0.86      0.84      0.85       396
soc.religion.christian       0.80      0.84      0.82       398

           avg / total       0.81      0.81      0.81      1502


In [20]:
# True labels are passed first and predictions second; in scikit-learn's
# layout rows index the true class and columns the predicted class.
metrics.confusion_matrix(twenty_test.target, predicted)


Out[20]:
array([[194,  25,  30,  70],
       [ 14, 356,  16,   3],
       [ 17,  35, 332,  12],
       [ 36,  22,   7, 333]])

In [ ]:


In [ ]: